# Setup: reset the workspace, configure knitr, load packages, and load the repo-level feature data.
# Remove every object from the global environment before running.
# NOTE(review): this also wipes the workspace of anyone who runs the script interactively — consider dropping it.
rm(list=ls())
# Set echo = TRUE as the default knitr chunk option
knitr::opts_chunk$set(echo = TRUE)
# Attach the required packages without printing their startup messages
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggbiplot))
suppressPackageStartupMessages(library(factoextra))
# Run the project-level script.
# NOTE(review): presumably this defines load_repo_features() and the saved_repo_features_* objects used below — confirm.
# NOTE(review): the absolute Dropbox path ties the document to one machine.
source("~/Dropbox/Documents/Github_mining/src/R/project_info.R")
# Load the repo features for the main dataset and for the high-profile dataset
repo_data_main <- load_repo_features(saved_repo_features_main)
repo_data_high_prof <- load_repo_features(saved_repo_features_high_prof)
Intuition for eigenvalues and eigenvectors
The proportion of the variation in the data variables that is explained by a PC is equal to that component’s associated eigenvalue divided by the sum of all eigenvalues.
An eigenvalue > 1 indicates that the PC accounts for more variance than a single original variable does in standardized data. This is commonly used as a cutoff for deciding which PCs to retain.
You can also limit the number of components to the smallest number that accounts for a certain fraction of the total variance. For example, if you are satisfied with 80% of the total variance explained, use the number of components needed to reach that level.
Note that a good dimension reduction is achieved when the first few PCs account for a large proportion of the variability (80-90%).
The column labeled PC1 is the eigenvector of the data covariance matrix associated with the largest eigenvalue. Its elements are the coefficients or loadings of each original variable on the first PC. It matters if the loadings have opposite signs, but not which is positive and which is negative. The magnitudes of the loadings are also important.
From here:
PC scores: Also called component scores in PCA, these scores are the scores of each case (row) on each factor (column). To compute the factor score for a given case for a given factor, one takes the case’s standardized score on each variable, multiplies by the corresponding factor loading of the variable for the given factor, and sums these products.
# Stack the high-profile and main datasets into one table, tagging each row
# with the dataset it came from via is_high_profile.
# Columns whose names contain "topic" are removed from the main dataset first.
repo_data_all <- rbind(
  mutate(repo_data_high_prof, is_high_profile = TRUE),
  mutate(select(repo_data_main, -contains("topic")), is_high_profile = FALSE)
)
# Get numeric columns
# vapply() always yields a logical mask, and drop = FALSE keeps a data frame
# even when only a single column is numeric.
repo_data_numeric <- repo_data_all[
  ,
  vapply(repo_data_all, is.numeric, logical(1)),
  drop = FALSE
]
# Replace NA's by the column median
# Done one whole column at a time instead of cell by cell: element-wise
# assignment into a data frame is very slow, and working on the column vector
# behaves the same for data frames and tibbles.
repo_data_numeric[] <- lapply(repo_data_numeric, function(column) {
  column[is.na(column)] <- median(column, na.rm = TRUE)
  column
})
# Apply transformations
# Centre every column to mean 0 and scale it to unit standard deviation
repo_data_numeric <- data.frame(scale(repo_data_numeric, center = TRUE, scale = TRUE))
# Run the PCA on the correlation matrix and keep the component scores
pca_res <- princomp(x = repo_data_numeric, scores = TRUE, cor = TRUE)
# Share of the total variance carried by each PC: its variance (sdev squared)
# divided by the sum of all PC variances
prop_var <- prop.table(pca_res$sdev^2)
# Running total of those shares, in PC order
cumulative_prop_var <- cumsum(prop_var)
# Smallest number of leading components whose cumulative proportion of
# variance is strictly greater than p.
#
# cumulative_prop_var: numeric vector of cumulative variance proportions,
#                      in component order.
# p:                   threshold to exceed.
#
# Returns the index of the first element of cumulative_prop_var that is > p.
# If no element exceeds p (including empty input), returns
# length(cumulative_prop_var), i.e. "keep every component", so callers always
# get a usable count instead of NULL.
ncomp_min_prop_var <- function(cumulative_prop_var, p) {
  above_threshold <- which(cumulative_prop_var > p)
  if (length(above_threshold) == 0L) {
    return(length(cumulative_prop_var))
  }
  # [[ ]] drops any element name carried over from the input vector
  above_threshold[[1L]]
}
# Retain the leading PCs whose cumulative share of variance exceeds 80%
ncomp_keep <- ncomp_min_prop_var(cumulative_prop_var, p = 0.8)
# Show how many components are retained
print(ncomp_keep)
## [1] 43
# Scree plot of the variance explained, limited to the retained PCs
fviz_screeplot(pca_res, ncp = ncomp_keep)
# Scree plot of the eigenvalues for the same PCs, with a red horizontal
# reference line at eigenvalue = 1
fviz_screeplot(pca_res, choice = "eigenvalue", ncp = ncomp_keep) +
  geom_hline(yintercept = 1, colour = "red")
# Display the top n variables for each PC
# n_top: how many variables (ranked by absolute loading) to print per component
n_top <- 20
# Extract the loadings matrix once rather than on every iteration
pc_loadings <- loadings(pca_res)
# seq_len() gives an empty loop, not c(1, 0), when no components are kept
for (j in seq_len(ncomp_keep)) {
  message("PC ", j)
  coefs <- pc_loadings[, j]
  # Rank variables by the magnitude of their loading, largest first
  ord <- order(abs(coefs), decreasing = TRUE)
  # head() avoids NA rows when there are fewer than n_top variables
  pr <- coefs[head(ord, n_top)]
  print(data.frame(coef = pr))
}
## PC 1
## coef
## total_bytes_no_data_procedural -0.1276308
## total_lines_code_and_comment_no_data -0.1227921
## total_lines_comment_no_data -0.1216897
## total_bytes_no_data_imperative -0.1216747
## total_lines_comment -0.1212459
## total_lines_of_code_no_data -0.1211968
## total_lines_code_and_comment -0.1182910
## total_file_size_no_data -0.1181329
## total_bytes_no_data_compiled -0.1177452
## total_bytes_no_data_object_oriented -0.1147552
## total_lines_of_code -0.1142718
## total_files_procedural -0.1137651
## total_bytes_no_data_functional_impure -0.1127555
## total_bytes_no_data_compatibility_nominative -0.1124212
## total_bytes_no_data_type_system_static -0.1105369
## total_files_compiled -0.1102068
## total_files_imperative -0.1094048
## num_langs_test_cases_no_data -0.1090291
## num_langs -0.1083831
## total_files_object_oriented -0.1080914
## PC 2
## coef
## total_bytes_no_data_type_system_safe -0.1288695
## total_bytes_no_data_type_system_dynamic -0.1261959
## total_bytes_no_data_interpreted -0.1244512
## total_bytes_no_data_compatibility_duck -0.1236843
## bytes_JavaScript -0.1220937
## total_files_type_system_safe -0.1218642
## total_lines_code_and_comment_JavaScript -0.1196745
## total_lines_of_code_JavaScript -0.1184044
## total_files_type_system_dynamic -0.1142995
## mean_bytes_m4 0.1135142
## max_lines_code_and_comment_m4 0.1103312
## max_lines_code_m4 0.1102678
## total_lines_comment_JavaScript -0.1098781
## mean_lines_code_and_comment_m4 0.1081187
## mean_lines_code_m4 0.1078653
## total_files_interpreted -0.1068622
## total_files_compatibility_duck -0.1066592
## max_lines_code_and_comment_Bourne_Shell 0.1037132
## max_lines_code_Bourne_Shell 0.1035287
## pct_files_no_data_object_oriented -0.1027611
## PC 3
## coef
## pct_files_no_data_compatibility_duck 0.1560800
## pct_bytes_no_data_compatibility_duck 0.1532218
## pct_files_no_data_type_system_dynamic 0.1521585
## pct_files_no_data_interpreted 0.1511942
## pct_files_no_data_type_system_static -0.1500162
## pct_bytes_no_data_type_system_dynamic 0.1479193
## pct_bytes_no_data_interpreted 0.1470214
## pct_bytes_no_data_type_system_static -0.1462854
## pct_files_no_data_compatibility_nominative -0.1420657
## pct_bytes_no_data_compatibility_nominative -0.1411327
## pct_bytes_no_data_Java -0.1358078
## mean_bytes_per_line_code_and_comment_Java -0.1203335
## pct_bytes_no_data_R 0.1145398
## pct_lines_comment_Java -0.1141904
## total_lines_code_and_comment_Java -0.1115311
## max_lines_code_Java -0.1108032
## total_lines_of_code_Java -0.1098941
## max_lines_code_and_comment_Java -0.1092429
## bytes_Java -0.1088435
## total_lines_comment_Java -0.1083414
## PC 4
## coef
## pct_files_no_data_compiled -0.1825425
## pct_bytes_no_data_compiled -0.1691235
## pct_files_no_data_type_system_safe -0.1584043
## pct_bytes_no_data_type_system_safe -0.1528362
## pct_files_no_data_imperative -0.1503853
## pct_bytes_no_data_imperative -0.1409746
## pct_bytes_no_data_Python -0.1254706
## pct_files_no_data_array 0.1209297
## pct_bytes_no_data_array 0.1198673
## total_lines_comment_C_Cpp_Header 0.1177038
## total_lines_code_and_comment_C_Cpp_Header 0.1127964
## num_files_no_data 0.1107477
## num_files_C_Cpp_Header 0.1097586
## total_lines_of_code_C_Cpp_Header 0.1085011
## mean_bytes_per_line_code_and_comment_Python -0.1068548
## pct_files_no_data_type_system_unsafe 0.1037420
## pct_bytes_no_data_Java -0.1034078
## num_files 0.1015726
## pct_bytes_no_data_R 0.1012059
## bytes_C_Cpp_Header 0.1011909
## PC 5
## coef
## pct_bytes_no_data_R -0.1448219
## pct_files_no_data_functional_impure -0.1344128
## pct_bytes_no_data_functional_impure -0.1314789
## pct_bytes_no_data_array -0.1302764
## pct_files_no_data_array -0.1263007
## total_files_type_system_static -0.1221173
## pct_files_no_data_procedural -0.1148500
## total_bytes_no_data_type_system_static -0.1132654
## total_files_compiled -0.1113646
## pct_files_no_data_object_oriented -0.1067859
## total_files_compatibility_nominative -0.1064765
## pct_bytes_no_data_object_oriented -0.1057431
## num_files_Cpp -0.1044121
## pct_files_no_data_type_system_unsafe -0.1039217
## pct_lines_comment_R -0.1032556
## total_files_type_system_unsafe -0.1030084
## pct_files_no_data_compatibility_duck -0.1029847
## pct_bytes_no_data_procedural -0.1021319
## pct_bytes_no_data_compatibility_duck -0.1020712
## total_lines_comment_Cpp -0.1013010
## PC 6
## coef
## bytes_SQL -0.1603115
## total_bytes_no_data_declarative -0.1602020
## total_lines_code_and_comment_SQL -0.1597201
## total_lines_of_code_SQL -0.1596877
## total_lines_comment_SQL -0.1592722
## num_files_SQL -0.1578101
## max_lines_code_SQL -0.1577875
## total_files_declarative -0.1575398
## max_lines_code_and_comment_SQL -0.1574855
## pct_bytes_test_cases_procedural 0.1379559
## pct_bytes_test_cases_object_oriented 0.1374439
## pct_lines_in_test_cases_no_data 0.1349042
## pct_bytes_in_test_cases_no_data 0.1321775
## pct_bytes_test_cases_imperative 0.1233329
## pct_bytes_test_cases_compiled 0.1145024
## pct_bytes_test_cases_functional_impure 0.1139793
## pct_bytes_test_cases_interpreted 0.1069488
## pct_bytes_test_cases_type_system_safe 0.1058571
## total_bytes_test_cases_object_oriented 0.1045796
## pct_bytes_no_data_type_system_dynamic -0.0999194
## PC 7
## coef
## total_bytes_no_data_declarative -0.1687734
## bytes_SQL -0.1681989
## total_lines_comment_SQL -0.1681696
## total_lines_code_and_comment_SQL -0.1678177
## total_lines_of_code_SQL -0.1676471
## max_lines_code_and_comment_SQL -0.1659676
## max_lines_code_SQL -0.1656246
## total_files_declarative -0.1628827
## num_files_SQL -0.1618883
## bytes_MATLAB -0.1314321
## total_lines_of_code_MATLAB -0.1312552
## total_lines_code_and_comment_MATLAB -0.1310628
## pct_bytes_test_cases_functional_pure -0.1308329
## total_lines_comment_MATLAB -0.1273178
## total_bytes_no_data_array -0.1269834
## total_files_array -0.1262827
## num_files_MATLAB -0.1200883
## max_lines_code_and_comment_MATLAB -0.1168897
## commit_authors_no_gender -0.1161183
## num_days_new_files_added -0.1150610
## PC 8
## coef
## total_lines_code_and_comment_Bourne_Again_Shell -0.17578749
## max_lines_code_and_comment_Bourne_Again_Shell -0.17510805
## total_lines_of_code_Bourne_Again_Shell -0.17492393
## max_lines_code_Bourne_Again_Shell -0.17403430
## total_lines_comment_Bourne_Again_Shell -0.17192075
## bytes_Bourne_Again_Shell -0.16508086
## mean_lines_code_and_comment_Bourne_Again_Shell -0.16474090
## mean_lines_code_Bourne_Again_Shell -0.16118644
## mean_bytes_Bourne_Again_Shell -0.13569158
## total_lines_comment_C -0.11725312
## max_lines_code_and_comment_Python 0.11223789
## mean_bytes_per_line_code_and_comment_Python 0.10966338
## max_lines_code_Python 0.10606161
## max_lines_code_C_Cpp_Header 0.09994796
## pct_bytes_no_data_Python 0.09940012
## mean_bytes_Python 0.09938811
## mean_lines_code_and_comment_Python 0.09891870
## max_lines_code_and_comment_C_Cpp_Header 0.09827440
## total_bytes_test_cases_type_system_static -0.09773522
## max_lines_code_and_comment_Cpp 0.09704321
## PC 9
## coef
## total_lines_of_code_MATLAB -0.2046767
## total_lines_code_and_comment_MATLAB -0.2039742
## bytes_MATLAB -0.2035720
## total_lines_comment_MATLAB -0.1973735
## num_files_MATLAB -0.1846725
## max_lines_code_and_comment_MATLAB -0.1832486
## max_lines_code_MATLAB -0.1751083
## total_files_array -0.1697399
## total_bytes_no_data_array -0.1601974
## pct_bytes_no_data_MATLAB -0.1476883
## mean_lines_code_and_comment_MATLAB -0.1431981
## mean_bytes_per_line_code_and_comment_MATLAB -0.1431630
## pct_files_no_data_imperative -0.1405149
## mean_lines_code_MATLAB -0.1392645
## mean_bytes_MATLAB -0.1347921
## pct_lines_comment_MATLAB -0.1337496
## pct_bytes_no_data_Python -0.1304746
## pct_bytes_no_data_imperative -0.1288551
## mean_lines_code_and_comment_Python -0.1244374
## pct_files_no_data_type_system_unsafe 0.1226126
## PC 10
## coef
## bytes_PHP 0.1595617
## total_lines_of_code_PHP 0.1591604
## total_lines_code_and_comment_PHP 0.1587388
## total_lines_comment_PHP 0.1564654
## num_files_PHP 0.1562980
## max_lines_code_PHP 0.1309357
## max_lines_code_and_comment_PHP 0.1246946
## pct_bytes_no_data_PHP 0.1200420
## total_files_declarative 0.1166108
## num_files_SQL 0.1165952
## max_lines_code_and_comment_Perl -0.1153217
## max_lines_code_Perl -0.1152090
## total_lines_of_code_SQL 0.1143251
## total_lines_code_and_comment_SQL 0.1142399
## bytes_SQL 0.1136413
## total_bytes_no_data_declarative 0.1132854
## total_lines_comment_SQL 0.1132673
## pct_bytes_no_data_compatibility_nominative -0.1128934
## max_lines_code_SQL 0.1101682
## max_lines_code_and_comment_SQL 0.1096996
## PC 11
## coef
## max_lines_code_Ruby 0.28794091
## max_lines_code_and_comment_Ruby 0.28773649
## bytes_Ruby 0.28748905
## total_lines_code_and_comment_Ruby 0.28428503
## total_lines_of_code_Ruby 0.28222816
## num_files_Ruby 0.27883798
## total_lines_comment_Ruby 0.27300780
## pct_bytes_no_data_Ruby 0.19526333
## mean_bytes_per_line_code_and_comment_Ruby 0.13927621
## total_files_compatibility_duck 0.12597287
## total_files_type_system_dynamic 0.09540202
## mean_lines_code_and_comment_Ruby 0.09110303
## mean_lines_code_Ruby 0.09042934
## pct_files_no_data_procedural -0.08896505
## pct_bytes_no_data_procedural -0.08887075
## mean_bytes_Ruby 0.08531728
## total_files_interpreted 0.08254316
## max_lines_code_and_comment_no_data -0.08116563
## bytes_PHP -0.07939042
## max_lines_code_no_data -0.07701542
## PC 12
## coef
## total_lines_of_code_Perl 0.2094945
## total_lines_code_and_comment_Perl 0.2084690
## max_lines_code_Perl 0.2041377
## num_files_Perl 0.2024330
## max_lines_code_and_comment_Perl 0.2023879
## pct_bytes_no_data_Perl 0.2007708
## total_lines_comment_Perl 0.1994074
## bytes_Perl 0.1944453
## mean_bytes_per_line_code_and_comment_Perl 0.1587875
## mean_lines_code_and_comment_Perl 0.1577549
## mean_bytes_Perl 0.1541149
## mean_lines_code_Perl 0.1526362
## mean_lines_code_and_comment_no_data -0.1481061
## mean_lines_code_no_data -0.1477807
## mean_lines_code_and_comment -0.1444444
## mean_lines_code -0.1435089
## max_lines_code_and_comment -0.1395865
## max_lines_code_and_comment_no_data -0.1386585
## mean_file_size_no_data -0.1379620
## max_lines_code_no_data -0.1375359
## PC 13
## coef
## pct_bytes_no_data_Java 0.1288739
## total_lines_of_code_Bourne_Shell 0.1174483
## max_lines_code_Bourne_Shell 0.1169153
## max_lines_code_and_comment_Bourne_Shell 0.1167828
## bytes_Bourne_Again_Shell -0.1155027
## total_lines_code_and_comment_Bourne_Shell 0.1154875
## mean_lines_code_and_comment_Bourne_Shell 0.1152790
## total_lines_code_and_comment_Bourne_Again_Shell -0.1150371
## mean_bytes_Bourne_Shell 0.1146503
## total_lines_comment_Bourne_Again_Shell -0.1145521
## mean_lines_code_Bourne_Shell 0.1145095
## total_lines_of_code_Bourne_Again_Shell -0.1139706
## bytes_Bourne_Shell 0.1122968
## pct_lines_comment_Java 0.1099680
## mean_bytes_per_line_code_and_comment_Java 0.1096952
## pct_files_no_data_type_system_static 0.1078512
## num_files_m4 0.1076396
## total_lines_comment_m4 0.1063677
## commit_authors -0.1047277
## max_lines_code_and_comment_Bourne_Again_Shell -0.1024884
## PC 14
## coef
## mean_lines_code_and_comment_no_data 0.13137172
## mean_lines_code_and_comment 0.12881710
## mean_lines_code_no_data 0.12589387
## mean_lines_code 0.12283369
## max_lines_code_and_comment_Ruby 0.12185156
## max_lines_code_Ruby 0.12185138
## total_bytes_no_data_compatibility_duck -0.11029549
## num_files_JavaScript -0.10943755
## max_lines_code_Perl 0.10847442
## total_lines_code_and_comment_JavaScript -0.10769655
## mean_file_size_no_data 0.10749119
## total_lines_of_code_JavaScript -0.10746903
## max_lines_code_and_comment_Perl 0.10741917
## pct_bytes_test_cases_object_oriented 0.10523690
## pct_bytes_test_cases_interpreted 0.10147052
## mean_bytes_Perl 0.10136976
## mean_lines_code_and_comment_Perl 0.10071275
## pct_bytes_test_cases_functional_impure 0.09895874
## mean_lines_code_Perl 0.09884155
## pct_bytes_test_cases_compatibility_duck 0.09787587
## PC 15
## coef
## bytes_C_Cpp_Header -0.1265870
## pct_bytes_no_data_type_system_unsafe 0.1264224
## mean_lines_code_C 0.1263978
## total_lines_code_and_comment_Python 0.1262095
## total_lines_of_code_Python 0.1254312
## max_lines_code_and_comment_C 0.1251040
## total_lines_comment_Python 0.1249964
## bytes_Python 0.1234337
## num_files_Python 0.1211313
## mean_lines_code_and_comment_C 0.1209729
## total_lines_of_code_C_Cpp_Header -0.1201117
## pct_bytes_no_data_compiled 0.1197629
## pct_bytes_no_data_C 0.1197047
## mean_bytes_C 0.1171369
## total_lines_code_and_comment_C_Cpp_Header -0.1169604
## num_files_C_Cpp_Header -0.1159057
## mean_bytes_per_line_code_and_comment_C_Cpp_Header 0.1150983
## mean_bytes_per_line_code_and_comment_C 0.1141447
## max_lines_code_C 0.1102222
## pct_bytes_no_data_Cpp 0.1076800
## PC 16
## coef
## bytes_C_Cpp_Header 0.1413859
## commit_authors_male -0.1285657
## total_lines_of_code_C_Cpp_Header 0.1279808
## num_files_Cpp -0.1266416
## bytes_PHP -0.1239111
## commit_authors -0.1224591
## total_lines_code_and_comment_Cpp -0.1222927
## total_lines_of_code_Cpp -0.1218565
## total_lines_comment_Cpp -0.1218335
## num_files_C_Cpp_Header 0.1211502
## total_lines_code_and_comment_PHP -0.1187777
## total_lines_of_code_PHP -0.1187686
## mean_new_files_per_day_with_new_files 0.1180444
## total_lines_comment_PHP -0.1177750
## total_lines_code_and_comment_C_Cpp_Header 0.1152143
## max_lines_code_PHP -0.1125345
## num_files_PHP -0.1115222
## total_files_type_system_unsafe -0.1100359
## max_lines_code_and_comment_PHP -0.1097612
## bytes_Cpp -0.1089323
## PC 17
## coef
## pct_files_no_data_logic -0.1731891
## total_files_logic -0.1712570
## pct_bytes_no_data_logic -0.1699602
## total_bytes_no_data_logic -0.1669657
## pct_bytes_no_data_declarative -0.1644858
## bytes_C_Cpp_Header 0.1638024
## total_lines_of_code_C_Cpp_Header 0.1599349
## num_files_C_Cpp_Header 0.1567673
## total_bytes_test_cases_logic -0.1556881
## pct_files_no_data_declarative -0.1549792
## total_lines_code_and_comment_C_Cpp_Header 0.1513034
## forks_count 0.1478956
## subscribers_count 0.1446753
## watchers_count 0.1433364
## stargazers_count 0.1433364
## commit_authors_male 0.1327361
## commit_authors 0.1295462
## total_lines_comment_C_Cpp_Header 0.1100994
## bytes_PHP 0.1051093
## commit_authors_no_gender 0.1036532
## PC 18
## coef
## total_bytes_no_data_logic -0.27117980
## pct_bytes_no_data_logic -0.26673049
## total_files_logic -0.26444794
## pct_files_no_data_logic -0.25998989
## total_bytes_test_cases_logic -0.24490609
## pct_bytes_no_data_declarative -0.24133523
## pct_files_no_data_declarative -0.22580749
## pct_bytes_no_data_object_oriented -0.10873023
## consecutive_months_no_new_files_added -0.10681401
## pct_bytes_no_data_functional_impure -0.10542656
## consecutive_months_no_commits -0.10198624
## commit_span_days -0.09490328
## pct_files_no_data_object_oriented -0.09152597
## pct_months_new_files_added 0.08613092
## mean_bytes_per_line_code_and_comment_MATLAB 0.08519153
## pct_files_no_data_functional_impure -0.08399541
## mean_lines_code_MATLAB 0.08387689
## mean_lines_code_and_comment_MATLAB 0.08329897
## pct_bytes_test_cases_declarative -0.08227450
## mean_bytes_MATLAB 0.08173162
## PC 19
## coef
## total_lines_of_code_Python -0.2091659
## total_lines_code_and_comment_Python -0.2089519
## bytes_Python -0.2079769
## total_lines_comment_Python -0.2019665
## num_files_Python -0.1870697
## pct_files_no_data_logic -0.1372176
## pct_bytes_no_data_logic -0.1353027
## total_bytes_no_data_logic -0.1288058
## num_files_JavaScript 0.1222123
## max_lines_code_Python -0.1210721
## max_lines_code_and_comment_Python -0.1209686
## total_lines_comment_JavaScript 0.1153241
## total_lines_code_and_comment_JavaScript 0.1151029
## total_files_logic -0.1146095
## watchers_count 0.1137620
## stargazers_count 0.1137620
## total_lines_of_code_JavaScript 0.1118237
## total_bytes_test_cases_logic -0.1111289
## pct_bytes_no_data_JavaScript 0.1106893
## pct_bytes_no_data_procedural 0.1070838
## PC 20
## coef
## mean_lines_code_C 0.1753989
## pct_bytes_no_data_C 0.1748107
## mean_lines_code_and_comment_C 0.1736898
## mean_bytes_C 0.1729926
## max_lines_code_and_comment_C 0.1713241
## max_lines_code_C 0.1647866
## pct_files_no_data_logic 0.1533043
## pct_bytes_no_data_logic 0.1517994
## total_bytes_no_data_logic 0.1464254
## total_files_logic 0.1438111
## mean_bytes_Cpp -0.1402967
## mean_lines_code_and_comment_Cpp -0.1392374
## watchers_count 0.1370817
## stargazers_count 0.1370817
## mean_lines_code_Cpp -0.1367879
## total_bytes_test_cases_logic 0.1347983
## pct_bytes_no_data_Cpp -0.1342564
## mean_bytes_per_line_code_and_comment_Cpp -0.1309554
## forks_count 0.1293262
## subscribers_count 0.1171088
## PC 21
## coef
## mean_bytes_JavaScript -0.1304911
## pct_files_no_data_procedural 0.1267511
## pct_lines_comment_JavaScript -0.1265050
## mean_lines_code_and_comment_no_data 0.1195147
## mean_lines_code_no_data 0.1182196
## max_lines_code_and_comment_Cpp 0.1175830
## pct_bytes_no_data_imperative 0.1163651
## total_files_compatibility_structural -0.1162719
## pct_bytes_no_data_procedural 0.1158719
## mean_lines_code_Perl 0.1158127
## max_lines_code_Cpp 0.1156713
## mean_lines_code_and_comment_Perl 0.1150973
## mean_bytes_Perl 0.1136723
## mean_lines_code_and_comment 0.1131873
## mean_lines_code 0.1115289
## pct_bytes_no_data_JavaScript -0.1104373
## mean_lines_code_and_comment_Cpp 0.1088651
## mean_lines_code_and_comment_JavaScript -0.1087886
## total_bytes_test_cases_compatibility_structural -0.1087170
## num_files_JavaScript 0.1079117
## PC 22
## coef
## mean_bytes_JavaScript 0.1623488
## total_lines_comment_Perl -0.1616739
## bytes_Perl -0.1514930
## mean_lines_code_and_comment_JavaScript 0.1471058
## total_lines_code_and_comment_Perl -0.1452632
## mean_lines_code_JavaScript 0.1414519
## total_lines_of_code_Perl -0.1384731
## total_lines_code_and_comment_Python 0.1368762
## total_lines_of_code_Python 0.1364775
## num_files_Python 0.1361560
## total_lines_comment_Python 0.1340855
## bytes_Python 0.1340143
## num_files_Perl -0.1292127
## mean_lines_code -0.1200868
## mean_lines_code_and_comment -0.1172208
## mean_bytes_MATLAB 0.1106038
## mean_lines_code_MATLAB 0.1104077
## mean_lines_code_and_comment_MATLAB 0.1096151
## total_files_compatibility_structural 0.1083588
## mean_lines_code_Python -0.1073290
## PC 23
## coef
## mean_lines_code_and_comment_MATLAB 0.1694504
## mean_lines_code_MATLAB 0.1664930
## mean_bytes_MATLAB 0.1658783
## mean_bytes_per_line_code_and_comment_MATLAB 0.1651162
## total_bytes_test_cases_compatibility_duck -0.1551043
## total_files_functional_pure -0.1498045
## total_bytes_no_data_functional_pure -0.1478842
## num_files_Perl 0.1441969
## pct_lines_comment_MATLAB 0.1420074
## mean_lines_code_Perl -0.1389710
## mean_lines_code_and_comment_Perl -0.1368041
## pct_bytes_no_data_MATLAB 0.1356177
## bytes_Perl 0.1355503
## mean_bytes_Perl -0.1354813
## total_lines_of_code_Perl 0.1348833
## total_lines_code_and_comment_Perl 0.1344322
## total_lines_comment_Perl 0.1292255
## pct_files_no_data_functional_pure -0.1290168
## total_bytes_test_cases_functional_pure -0.1272304
## total_bytes_test_cases_interpreted -0.1243680
## PC 24
## coef
## forks_count -0.2024822
## watchers_count -0.2011704
## stargazers_count -0.2011704
## total_lines_comment_Perl -0.1524145
## bytes_Perl -0.1505578
## subscribers_count -0.1483543
## total_lines_code_and_comment_Perl -0.1456720
## total_lines_of_code_Perl -0.1420709
## num_files_Perl -0.1347029
## mean_bytes_per_line_code_and_comment_Perl 0.1285004
## total_files_functional_pure -0.1269036
## total_bytes_no_data_functional_pure -0.1262661
## num_files_Java 0.1234245
## mean_bytes_Java -0.1210828
## mean_lines_code_Java -0.1195390
## bytes_Java 0.1190247
## total_lines_of_code_Java 0.1183912
## mean_lines_code_and_comment_C 0.1176253
## mean_bytes_C 0.1175120
## total_lines_code_and_comment_Java 0.1174786
## PC 25
## coef
## mean_bytes_SQL 0.2194527
## mean_lines_code_SQL 0.2081194
## mean_lines_code_and_comment_SQL 0.2051570
## pct_bytes_no_data_SQL 0.1423101
## pct_lines_comment_SQL 0.1379925
## pct_bytes_no_data_JavaScript -0.1362489
## mean_bytes_JavaScript -0.1353051
## total_lines_code_and_comment_R 0.1349216
## consecutive_months_no_new_files_added -0.1334242
## total_lines_of_code_R 0.1320191
## mean_lines_code_and_comment_JavaScript -0.1315816
## mean_lines_code_JavaScript -0.1302130
## bytes_R 0.1245428
## total_lines_comment_R 0.1219369
## max_lines_code_and_comment_R 0.1167297
## max_lines_code_R 0.1141266
## consecutive_months_no_commits -0.1111053
## pct_months_with_commits 0.1071757
## pct_months_new_files_added 0.1063677
## pct_bytes_test_cases_declarative 0.1040184
## PC 26
## coef
## paper_authors 0.40678270
## paper_authors_male 0.39700827
## paper_authors_female 0.39212517
## paper_authors_no_gender 0.38490639
## consecutive_months_no_commits -0.11518616
## consecutive_months_no_new_files_added -0.11485460
## total_files_compatibility_structural 0.11454466
## total_bytes_no_data_compatibility_structural 0.10934267
## total_bytes_test_cases_compatibility_structural 0.10841887
## pct_bytes_no_data_compatibility_structural 0.10402379
## pct_files_no_data_compatibility_structural 0.09888705
## commit_span_days -0.09499199
## pct_months_with_commits 0.09415847
## pct_months_new_files_added 0.09160881
## mean_lines_code_and_comment_Ruby -0.08695453
## mean_lines_code_Ruby -0.08556228
## mean_bytes_Ruby -0.08438590
## mean_lines_code_Cpp 0.07873498
## mean_lines_code_and_comment_Cpp 0.07753349
## mean_bytes_Cpp 0.07226755
## PC 27
## coef
## paper_authors_female -0.2150938
## paper_authors -0.2135144
## paper_authors_male -0.2112017
## paper_authors_no_gender -0.1786439
## total_files_compatibility_structural 0.1767714
## total_bytes_no_data_compatibility_structural 0.1753555
## total_bytes_test_cases_compatibility_structural 0.1653538
## pct_bytes_no_data_compatibility_structural 0.1560090
## pct_files_no_data_compatibility_structural 0.1441295
## mean_lines_code_and_comment_Ruby -0.1392944
## mean_lines_code_Ruby -0.1381617
## mean_bytes_Ruby -0.1371632
## mean_lines_code_C -0.1238222
## mean_lines_code_and_comment_C -0.1213037
## mean_bytes_C -0.1203016
## mean_lines_code_and_comment_SQL -0.1195893
## mean_bytes_per_line_code_and_comment_Python 0.1162541
## mean_lines_code_SQL -0.1100360
## mean_bytes_SQL -0.1099730
## pct_bytes_no_data_SQL -0.1077019
## PC 28
## coef
## total_files_functional_pure 0.37126972
## total_bytes_no_data_functional_pure 0.36867341
## pct_files_no_data_functional_pure 0.32770144
## total_bytes_test_cases_functional_pure 0.29437131
## pct_bytes_no_data_functional_pure 0.25945570
## total_files_compatibility_structural -0.16767811
## total_bytes_test_cases_compatibility_structural -0.16187075
## total_bytes_no_data_compatibility_structural -0.15292638
## pct_bytes_no_data_compatibility_structural -0.14724642
## pct_files_no_data_compatibility_structural -0.13789560
## mean_lines_code_and_comment_Ruby -0.10692404
## mean_lines_code_Ruby -0.10674689
## mean_bytes_Ruby -0.10369559
## total_bytes_test_cases_compatibility_duck -0.09339405
## mean_bytes_C 0.08865648
## mean_lines_code_and_comment_C 0.08778915
## mean_lines_code_C 0.08536127
## num_files_C -0.07824557
## consecutive_months_no_commits 0.07733337
## total_bytes_test_cases_type_system_dynamic -0.07671748
## PC 29
## coef
## total_files_compatibility_structural -0.2291810
## total_bytes_no_data_compatibility_structural -0.2135204
## total_files_functional_pure -0.2124434
## total_bytes_test_cases_compatibility_structural -0.2108957
## total_bytes_no_data_functional_pure -0.2071883
## pct_bytes_no_data_compatibility_structural -0.1982646
## pct_files_no_data_compatibility_structural -0.1914380
## pct_files_no_data_functional_pure -0.1878174
## total_bytes_test_cases_functional_pure -0.1584035
## pct_bytes_no_data_functional_pure -0.1508498
## pct_bytes_test_cases_type_system_dynamic 0.1418416
## mean_bytes_SQL -0.1344065
## pct_bytes_test_cases_interpreted 0.1331107
## mean_lines_code_SQL -0.1265889
## mean_lines_code_and_comment_SQL -0.1226090
## num_files_C -0.1134522
## pct_bytes_no_data_Bourne_Again_Shell 0.1025358
## total_size_test_cases_no_data -0.1023627
## mean_bytes_per_line_code_and_comment_Bourne_Again_Shell 0.1016824
## pct_bytes_test_cases_compatibility_duck 0.1001677
## PC 30
## coef
## mean_lines_code_and_comment_Ruby 0.31116030
## mean_lines_code_Ruby 0.31070842
## mean_bytes_Ruby 0.30655430
## mean_bytes_SQL -0.18674609
## mean_lines_code_and_comment_SQL -0.17597570
## pct_bytes_no_data_SQL -0.17490493
## mean_bytes_per_line_code_and_comment_Ruby 0.16388850
## mean_lines_code_SQL -0.16339950
## pct_lines_comment_SQL -0.14858778
## mean_bytes_per_line_code_and_comment_SQL -0.11789275
## total_bytes_test_cases_declarative -0.11298883
## pct_bytes_no_data_Ruby 0.11051518
## paper_authors 0.10821138
## num_files_Ruby -0.10659311
## paper_authors_male 0.10636621
## paper_authors_female 0.10157874
## paper_authors_no_gender 0.10116686
## pct_bytes_no_data_declarative -0.09425818
## total_lines_of_code_Ruby -0.09217090
## total_lines_code_and_comment_Ruby -0.09170198
## PC 31
## coef
## total_lines_code_and_comment_R -0.2071848
## total_lines_of_code_R -0.2049381
## bytes_R -0.1949590
## total_lines_comment_R -0.1829765
## total_bytes_test_cases_functional_impure -0.1281493
## total_lines_test_cases_no_data -0.1273825
## total_bytes_test_cases_procedural -0.1234750
## mean_bytes_C -0.1201605
## pct_months_new_files_added -0.1198662
## mean_lines_code_and_comment_C -0.1192760
## consecutive_months_no_new_files_added 0.1181847
## mean_lines_code_C -0.1170970
## total_size_test_cases_no_data -0.1162634
## total_bytes_no_data_compatibility_structural 0.1153091
## mean_bytes_per_line_code_and_comment_R 0.1149672
## pct_months_with_commits -0.1130381
## max_lines_code_R -0.1129662
## total_bytes_test_cases_object_oriented -0.1120389
## pct_bytes_no_data_R 0.1113223
## max_lines_code_and_comment_R -0.1097307
## PC 32
## coef
## pct_months_new_files_added -0.29820616
## consecutive_months_no_commits 0.29455420
## pct_months_with_commits -0.29369018
## consecutive_months_no_new_files_added 0.27732772
## mean_lines_code_Cpp -0.18454415
## mean_lines_code_and_comment_Cpp -0.18248042
## mean_bytes_Cpp -0.18205409
## commit_span_days 0.17487845
## mean_bytes_Ruby -0.12902264
## mean_lines_code_Ruby -0.12649006
## mean_lines_code_and_comment_Ruby -0.12469452
## mean_commits_per_month -0.10547773
## total_lines_code_and_comment_R 0.09845219
## total_lines_of_code_R 0.09418940
## total_lines_comment_R 0.09312011
## paper_authors_male 0.09188496
## bytes_R 0.09085796
## max_lines_code_R 0.08890235
## paper_authors 0.08872460
## max_lines_code_Cpp -0.08780826
## PC 33
## coef
## mean_lines_code_Cpp 0.2423072
## mean_lines_code_and_comment_Cpp 0.2393224
## mean_bytes_Cpp 0.2359273
## pct_months_new_files_added -0.1869989
## pct_months_with_commits -0.1788101
## consecutive_months_no_commits 0.1757697
## consecutive_months_no_new_files_added 0.1741110
## mean_bytes_per_line_code_and_comment_Bourne_Again_Shell 0.1296733
## pct_lines_comment_Bourne_Again_Shell 0.1210644
## mean_commits_per_month -0.1202588
## total_bytes_test_cases_type_system_safe 0.1159793
## pct_bytes_test_cases_compatibility_duck 0.1148259
## total_bytes_test_cases_compatibility_duck 0.1148251
## max_lines_code_Cpp 0.1109134
## max_lines_code_and_comment_C_Cpp_Header -0.1080308
## mean_lines_code_C_Cpp_Header -0.1076116
## mean_lines_code_and_comment_C_Cpp_Header -0.1054239
## mean_bytes_C_Cpp_Header -0.1031514
## total_bytes_test_cases_interpreted 0.1031040
## total_bytes_test_cases_type_system_dynamic 0.1028638
## PC 34
## coef
## commits 0.1760309
## commits_no_gender 0.1602065
## mean_lines_code_and_comment_Java -0.1564970
## mean_lines_code_Java -0.1534757
## mean_bytes_Java -0.1527743
## mean_lines_code_C_Cpp_Header -0.1465496
## mean_bytes_C_Cpp_Header -0.1449982
## commit_span_days 0.1443712
## mean_commits_per_month 0.1418592
## mean_lines_code_and_comment_C_Cpp_Header -0.1416119
## max_lines_code_and_comment_C_Cpp_Header -0.1370157
## commits_male 0.1352525
## commits_female 0.1338220
## forks_count -0.1314032
## pct_bytes_test_cases_type_system_static -0.1308207
## watchers_count -0.1263181
## stargazers_count -0.1263181
## max_lines_code_C_Cpp_Header -0.1262598
## mean_day_new_files_added 0.1246620
## mean_lines_code_and_comment_C -0.1211235
## PC 35
## coef
## mean_lines_code_Java -0.2493039
## mean_lines_code_and_comment_Java -0.2478298
## mean_bytes_Java -0.2383029
## total_bytes_test_cases_compatibility_duck -0.1594534
## mean_bytes_Ruby 0.1441586
## pct_months_with_commits -0.1427153
## mean_lines_code_Ruby 0.1387760
## mean_lines_code_and_comment_Ruby 0.1374266
## pct_bytes_test_cases_compatibility_nominative 0.1352331
## pct_months_new_files_added -0.1286739
## mean_bytes_PHP -0.1184583
## pct_bytes_test_cases_type_system_static 0.1184508
## mean_lines_code_PHP -0.1184259
## consecutive_months_no_commits 0.1162435
## mean_lines_code_and_comment_PHP -0.1118598
## pct_bytes_test_cases_type_system_unsafe 0.1110502
## mean_bytes_Python -0.1090660
## max_lines_code_Java -0.1048985
## total_bytes_test_cases_type_system_dynamic -0.1047449
## mean_commits_per_month -0.1030173
## PC 36
## coef
## mean_bytes_C -0.1970224
## mean_lines_code_and_comment_C -0.1946505
## mean_lines_code_C_Cpp_Header 0.1928191
## mean_lines_code_and_comment_C_Cpp_Header 0.1902177
## num_citations_pmc 0.1891453
## mean_bytes_C_Cpp_Header 0.1858760
## mean_lines_code_C -0.1855156
## num_citations_per_week_pmc_minus_2_years 0.1466377
## pct_bytes_test_cases_type_system_static -0.1417717
## mean_bytes_per_line_code_and_comment_Bourne_Again_Shell -0.1371336
## mean_lines_code_and_comment_PHP -0.1343874
## mean_lines_code_PHP -0.1334128
## mean_bytes_PHP -0.1295642
## pct_bytes_no_data_compatibility_structural -0.1293539
## num_files_Bourne_Again_Shell -0.1194878
## pct_files_no_data_compatibility_structural -0.1183791
## pct_bytes_no_data_C_Cpp_Header 0.1150331
## pct_bytes_test_cases_compatibility_nominative -0.1137080
## pct_bytes_test_cases_type_system_dynamic 0.1136779
## pct_bytes_test_cases_interpreted 0.1083133
## PC 37
## coef
## mean_bytes_PHP -0.33411412
## mean_lines_code_PHP -0.33021284
## mean_lines_code_and_comment_PHP -0.31523397
## mean_lines_code_Java 0.18100889
## mean_lines_code_and_comment_Java 0.17832566
## mean_bytes_Java 0.17426452
## total_lines_comment_PHP 0.13235263
## num_files_PHP 0.12639347
## total_bytes_test_cases_compatibility_duck -0.12463262
## total_lines_code_and_comment_PHP 0.11989389
## total_lines_of_code_PHP 0.11362213
## mean_bytes_per_line_code_and_comment_PHP -0.10757471
## mean_bytes_C_Cpp_Header -0.10158674
## pct_lines_comment_PHP -0.10075777
## mean_lines_code_C_Cpp_Header -0.09594984
## mean_lines_code_Python 0.09513806
## commits_no_gender 0.09454034
## mean_lines_code_and_comment_Python 0.09382838
## commits 0.09204324
## pct_bytes_test_cases_compatibility_duck -0.09097073
## PC 38
## coef
## mean_lines_code_PHP 0.2112546
## mean_bytes_PHP 0.2099476
## mean_lines_code_and_comment_PHP 0.2076452
## mean_lines_code_Cpp 0.1582911
## mean_lines_code_and_comment_Cpp 0.1553876
## mean_bytes_Cpp 0.1542720
## num_files_Bourne_Again_Shell -0.1521207
## mean_bytes_per_line_code_and_comment_Bourne_Again_Shell -0.1399263
## mean_lines_code_and_comment_JavaScript -0.1316971
## mean_lines_code_JavaScript -0.1304612
## total_bytes_test_cases_type_system_dynamic -0.1278751
## total_bytes_test_cases_interpreted -0.1260391
## mean_bytes_JavaScript -0.1223414
## pct_bytes_no_data_Bourne_Again_Shell -0.1181020
## total_lines_of_code_C 0.1171636
## total_lines_code_and_comment_C 0.1154156
## num_citations_per_week_pmc_minus_2_years 0.1151903
## bytes_C 0.1136074
## pct_lines_comment_no_data -0.1110109
## pct_lines_comment -0.1084579
## PC 39
## coef
## mean_lines_code_C_Cpp_Header -0.2103984
## mean_bytes_C_Cpp_Header -0.2099714
## mean_lines_code_and_comment_C_Cpp_Header -0.1945452
## mean_bytes_per_line_code_and_comment_Bourne_Again_Shell -0.1705914
## max_lines_code_C_Cpp_Header -0.1556282
## max_lines_code_and_comment_C_Cpp_Header -0.1469572
## pct_bytes_no_data_Bourne_Again_Shell -0.1461424
## pct_bytes_no_data_Cpp 0.1425066
## mean_bytes_Python 0.1395930
## mean_lines_code_Python 0.1351665
## mean_lines_code_and_comment_Python 0.1350222
## pct_lines_comment_Bourne_Again_Shell -0.1344011
## mean_bytes_R 0.1208725
## pct_lines_comment_C_Cpp_Header 0.1198925
## pct_bytes_no_data_type_system_unsafe 0.1146158
## mean_lines_code_R 0.1138313
## num_files_Bourne_Again_Shell -0.1106436
## mean_lines_code_and_comment_R 0.1048123
## mean_bytes_C -0.1044362
## mean_lines_code_C -0.1023463
## PC 40
## coef
## mean_lines_code_R 0.34099347
## mean_bytes_R 0.31644484
## mean_lines_code_and_comment_R 0.30744375
## num_files_R -0.23243282
## pct_lines_comment_no_data -0.21141378
## pct_lines_comment -0.20797266
## max_lines_code_R 0.16140220
## pct_bytes_test_cases_compatibility_structural 0.15877192
## total_lines_comment_R -0.14386823
## max_lines_code_and_comment_R 0.14217885
## pct_bytes_test_cases_logic 0.13051961
## pct_bytes_test_cases_declarative 0.12579660
## pct_lines_comment_R -0.12427783
## total_lines_code_and_comment_R -0.11050272
## bytes_R -0.10942105
## mean_lines_code_C_Cpp_Header 0.10803861
## pct_bytes_test_cases_imperative 0.10791887
## mean_bytes_C_Cpp_Header 0.10315820
## num_files_Bourne_Shell -0.10187908
## mean_lines_code_and_comment_C_Cpp_Header 0.09681007
## PC 41
## coef
## pct_bytes_no_data_compatibility_structural 0.2603736
## pct_files_no_data_compatibility_structural 0.2459516
## pct_bytes_no_data_m4 0.1908717
## num_files_C -0.1872596
## mean_lines_code_Java 0.1666254
## total_lines_of_code_C -0.1612316
## mean_lines_code_and_comment_Java 0.1581595
## mean_bytes_C 0.1547738
## mean_lines_code_and_comment_C 0.1532020
## mean_bytes_Java 0.1487645
## mean_lines_code_C 0.1479225
## num_files_Bourne_Shell -0.1449386
## total_lines_code_and_comment_C -0.1405180
## bytes_C -0.1309369
## mean_commit_message_len -0.1282991
## median_commit_message_len -0.1214323
## total_bytes_no_data_compatibility_structural 0.1189608
## pct_lines_comment_no_data -0.1151939
## num_files_Bourne_Again_Shell -0.1118813
## mean_lines_code_m4 0.1096756
## PC 42
## coef
## pct_bytes_test_cases_logic 0.44635461
## pct_bytes_test_cases_declarative 0.40151918
## pct_bytes_test_cases_functional_pure 0.29542945
## num_citations_pmc 0.25510700
## num_citations_per_week_pmc_minus_2_years 0.23014305
## total_bytes_test_cases_declarative 0.16566858
## num_non_committing_authors 0.14222673
## pct_bytes_test_cases_functional_impure -0.10712140
## mean_bytes_per_line_code_and_comment_Bourne_Again_Shell 0.10124580
## pct_bytes_no_data_Bourne_Again_Shell 0.09932908
## num_files_Bourne_Again_Shell 0.09865289
## mean_commit_message_len 0.09491972
## pct_bytes_test_cases_compatibility_duck -0.09410354
## total_bytes_test_cases_logic 0.09200106
## pct_lines_comment_SQL -0.08895700
## pct_bytes_no_data_logic -0.08815512
## pct_files_no_data_logic -0.08577923
## watchers_count -0.08384819
## stargazers_count -0.08384819
## pct_bytes_test_cases_type_system_dynamic -0.07963399
## PC 43
## coef
## shannon_commit_author_gender 0.2931185
## shannon_commits_gender 0.2788318
## mean_bytes_per_line_code_and_comment_Bourne_Shell 0.1991365
## mean_bytes_per_line_code_and_comment_Bourne_Again_Shell -0.1712476
## num_files_Bourne_Again_Shell -0.1644507
## mean_bytes_Bourne_Shell 0.1642314
## mean_bytes_per_line_code_and_comment_no_data 0.1598235
## pct_lines_comment_Bourne_Again_Shell -0.1570568
## pct_bytes_test_cases_logic 0.1515720
## pct_bytes_no_data_Bourne_Shell 0.1493897
## commits_female 0.1384271
## commit_authors_female 0.1381689
## mean_file_size 0.1373466
## pct_lines_comment_Cpp 0.1308229
## pct_bytes_no_data_Bourne_Again_Shell -0.1200563
## pct_bytes_test_cases_functional_pure 0.1195572
## mean_lines_code_Bourne_Shell 0.1188103
## mean_lines_code_and_comment_Bourne_Shell 0.1173803
## pct_bytes_test_cases_declarative 0.1157019
## mean_bytes_per_line_code_and_comment 0.1131613
# Find the principal components associated with a vector of repo measurements.
#
# For each of the first `ncomp` PCs, the per-repo scores on that PC are tested
# against `repo_obs`:
#   * logical `repo_obs`: two-sample Wilcoxon rank-sum test of the scores split
#     by TRUE/FALSE; the median score of each group is reported;
#   * numeric `repo_obs`: simple linear regression of the scores on `repo_obs`;
#     the p-value and slope of the `repo_obs` coefficient are reported.
#
# Args:
#   repo_obs: Logical or numeric vector with one entry per row of `scores`.
#   scores:   Matrix of PC scores (rows = repos, columns = PCs). Defaults to
#             the scores of the global PCA fit `pca_res`.
#   ncomp:    Number of leading PCs to test. Defaults to the global
#             `ncomp_keep`.
#   alpha:    Only PCs with a p-value strictly below this cutoff are returned.
#
# Returns:
#   A data frame sorted by increasing p-value, with columns
#   PC, pval, median_true, median_false (logical input) or
#   PC, pval, estimate (numeric input). It has zero rows when no PC passes
#   the cutoff.
#
# Raises:
#   An error if `repo_obs` is neither logical nor numeric, or if its length
#   differs from the number of rows of `scores`.
associated_pcs <- function(repo_obs, scores = pca_res$scores,
                           ncomp = ncomp_keep, alpha = 0.0001) {
  is_binary <- is.logical(repo_obs)

  # Fail fast, before any test is run.
  if (!is_binary && !is.numeric(repo_obs)) {
    stop("Data type not supported")
  }
  # data.frame() would silently recycle a shorter vector whose length divides
  # the number of rows, pairing observations with the wrong repos.
  if (length(repo_obs) != nrow(scores)) {
    stop("repo_obs must have one entry per row of scores")
  }

  # Zero-row template: fixes the column layout even when no PC is significant.
  if (is_binary) {
    empty <- data.frame(PC = integer(), pval = numeric(),
                        median_true = numeric(), median_false = numeric())
  } else {
    empty <- data.frame(PC = integer(), pval = numeric(), estimate = numeric())
  }

  # Test one PC; returns a one-row data frame, or NULL if it is not significant.
  test_pc <- function(j) {
    pc_data <- data.frame(obs = repo_obs, score = scores[, j])
    if (is_binary) {
      p <- wilcox.test(score ~ obs, data = pc_data)$p.value
      # An undefined p-value (e.g. constant scores) cannot count as a hit.
      if (is.na(p) || p >= alpha) {
        return(NULL)
      }
      # which() drops NA observations from both groups.
      data.frame(PC = j, pval = p,
                 median_true = median(pc_data$score[which(pc_data$obs)]),
                 median_false = median(pc_data$score[which(!pc_data$obs)]))
    } else {
      # Summarise the fit once and read both p-value and slope from it.
      coefs <- summary(lm(score ~ obs, data = pc_data))$coefficients
      p <- coefs["obs", 4]
      if (is.na(p) || p >= alpha) {
        return(NULL)
      }
      data.frame(PC = j, pval = p, estimate = coefs["obs", 1])
    }
  }

  # seq_len() yields an empty sequence for ncomp == 0 (1:0 would give c(1, 0)).
  hits <- lapply(seq_len(ncomp), test_pc)
  hits <- hits[!vapply(hits, is.null, logical(1))]

  # Bind once instead of growing the data frame inside the loop.
  out <- do.call(rbind, c(list(empty), hits))
  out <- out[order(out$pval), , drop = FALSE]
  rownames(out) <- NULL
  out
}
# Display some associations
# Logical feature, so associated_pcs() runs a Wilcoxon rank-sum test per PC and
# reports, for each PC with p < 0.0001, the median score of the TRUE and FALSE
# groups.
associated_pcs(repo_data_all$is_high_profile)
## PC pval median_true median_false
## 1 1 1.876491e-12 -7.015125 1.767586075
## 2 10 5.817301e-12 -3.733818 0.049158817
## 3 42 1.231610e-11 2.548866 0.028518650
## 4 6 4.466402e-10 3.488499 -0.411864116
## 5 7 3.935438e-09 -3.576013 0.188884357
## 6 36 3.449137e-08 3.861506 0.026719690
## 7 16 1.935552e-07 -2.953724 -0.102268413
## 8 23 3.194646e-07 1.620941 -0.008419759
## 9 3 4.824076e-07 -3.076207 0.271779893
## 10 28 5.093561e-07 1.061539 -0.033890993
## 11 13 5.630509e-06 -3.090234 -0.112774488
## 12 17 5.742936e-06 2.250737 0.162178664
## 13 9 2.206802e-05 1.657870 0.597585522
## 14 38 2.932533e-05 1.485005 0.014469707
# Numeric feature: each PC's scores are regressed on total_file_size with lm().
# `estimate` is the fitted slope per unit of the feature, so its magnitude
# depends on the feature's scale.
associated_pcs(repo_data_all$total_file_size)
## PC pval estimate
## 1 1 2.813028e-234 -7.267761e-07
## 2 4 3.536057e-22 1.306006e-07
## 3 16 2.543945e-12 6.642074e-08
## 4 3 1.621018e-09 -8.756602e-08
## 5 6 1.763092e-08 -7.311027e-08
## 6 15 8.796145e-08 -5.259307e-08
## 7 17 3.764442e-07 4.664127e-08
## 8 2 1.504949e-05 -7.665350e-08
## 9 7 4.014657e-05 5.158842e-08
## 10 19 9.586471e-05 -3.428360e-08
# Numeric feature (lm branch). num_citations_pmc also appears among the listed
# loadings of the two top-ranked PCs in the output below (42 and 36).
associated_pcs(repo_data_all$num_citations_pmc)
## PC pval estimate
## 1 42 8.127347e-66 0.0009613828
## 2 36 2.505215e-39 0.0009506985
## 3 6 8.543879e-14 0.0011396090
## 4 33 3.668741e-13 0.0005642248
## 5 1 4.946870e-12 -0.0020956147
## 6 10 4.274431e-11 -0.0008689062
## 7 38 8.660794e-10 0.0004264397
## 8 20 3.288712e-09 0.0005980542
## 9 16 2.115919e-08 -0.0006315172
## 10 43 1.432503e-07 -0.0003197448
## 11 7 3.437949e-07 -0.0007511429
## 12 13 2.885783e-06 -0.0005741770
## 13 3 3.410662e-06 -0.0008033663
## 14 17 7.810375e-05 0.0004337214
# Numeric feature (lm branch): PCs whose scores show a linear association with
# stargazers_count at p < 0.0001.
associated_pcs(repo_data_all$stargazers_count)
## PC pval estimate
## 1 24 2.062986e-68 -0.0028663220
## 2 17 1.069134e-50 0.0030206950
## 3 20 3.943133e-39 0.0024483312
## 4 19 3.872706e-29 0.0021909354
## 5 14 4.399826e-24 -0.0023263012
## 6 10 2.088639e-23 -0.0024659401
## 7 13 9.140973e-23 -0.0022693015
## 8 34 3.254257e-19 -0.0012780708
## 9 21 1.330528e-16 0.0014918342
## 10 16 3.462218e-15 -0.0016842740
## 11 2 6.832548e-15 -0.0030984095
## 12 6 4.770214e-12 0.0020206905
## 13 7 2.723023e-10 -0.0017845240
## 14 35 4.081460e-09 0.0008273181
## 15 1 1.501269e-08 -0.0032794976
## 16 42 7.372890e-07 -0.0005974368
## 17 11 3.608204e-05 0.0010061961
# NOTE(review): the result below is identical to the stargazers_count result,
# and the two variables have identical loadings wherever both are listed
# (e.g. PC 34 and PC 42), so the columns presumably hold the same values in
# this dataset and this call adds no new information. Confirm.
associated_pcs(repo_data_all$watchers_count)
## PC pval estimate
## 1 24 2.062986e-68 -0.0028663220
## 2 17 1.069134e-50 0.0030206950
## 3 20 3.943133e-39 0.0024483312
## 4 19 3.872706e-29 0.0021909354
## 5 14 4.399826e-24 -0.0023263012
## 6 10 2.088639e-23 -0.0024659401
## 7 13 9.140973e-23 -0.0022693015
## 8 34 3.254257e-19 -0.0012780708
## 9 21 1.330528e-16 0.0014918342
## 10 16 3.462218e-15 -0.0016842740
## 11 2 6.832548e-15 -0.0030984095
## 12 6 4.770214e-12 0.0020206905
## 13 7 2.723023e-10 -0.0017845240
## 14 35 4.081460e-09 0.0008273181
## 15 1 1.501269e-08 -0.0032794976
## 16 42 7.372890e-07 -0.0005974368
## 17 11 3.608204e-05 0.0010061961
# The comparison yields a logical vector, so the Wilcoxon branch is used and
# median_true / median_false are the median scores of repos whose
# first_author_gender is / is not "female".
# NOTE(review): rows where first_author_gender is NA compare to NA rather than
# FALSE, and which() drops them from both medians. Confirm that is intended.
associated_pcs(repo_data_all$first_author_gender == "female")
## PC pval median_true median_false
## 1 43 5.946032e-08 0.1217638 -0.1555944
## 2 1 3.155578e-05 2.1675745 1.5531409
# Same test for the last author; the zero-row result below means no PC
# reached the p < 0.0001 cutoff.
associated_pcs(repo_data_all$last_author_gender == "female")
## [1] PC pval median_true median_false
## <0 rows> (or 0-length row.names)
# Numeric feature (lm branch). The top hit, PC 43, is the component on which
# shannon_commit_author_gender has the largest listed loading.
associated_pcs(repo_data_all$shannon_commit_author_gender)
## PC pval estimate
## 1 43 3.156426e-50 4.560701
## 2 34 1.327683e-09 1.769080
## 3 41 2.155488e-09 -1.679837
# Make biplots
# One biplot per pair of consecutive retained PCs (cp on the x axis, cp + 1 on
# the y axis). Observations are grouped by is_high_profile with an ellipse per
# group; variable axes are not drawn.
# The loop stops at ncomp_keep - 1 so that the second axis, cp + 1, never
# refers to a component beyond the retained ones. max(..., 0) keeps seq_len()
# valid when no component is retained.
for (cp in seq_len(max(ncomp_keep - 1, 0))) {
  plt <- ggbiplot(pca_res, obs.scale = 1, var.scale = 1,
                  choices = c(cp, cp + 1),
                  groups = repo_data_all$is_high_profile, ellipse = TRUE,
                  var.axes = FALSE, alpha = 0.2) +
    scale_color_discrete(name = "") +
    theme(legend.direction = "horizontal", legend.position = "top")
  # ggplot objects are not auto-printed inside a for loop.
  print(plt)
}
# Record the R and package versions used to produce this report.
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] jsonlite_1.5 factoextra_1.0.5 ggbiplot_0.55 scales_0.5.0
## [5] plyr_1.8.4 ggplot2_2.2.1 dplyr_0.7.4
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.14 ggpubr_0.1.6 knitr_1.18 bindr_0.1
## [5] magrittr_1.5 munsell_0.4.3 colorspace_1.3-2 R6_2.2.2
## [9] rlang_0.1.6 stringr_1.2.0 tools_3.4.3 gtable_0.2.0
## [13] htmltools_0.3.6 lazyeval_0.2.1 yaml_2.1.16 assertthat_0.2.0
## [17] rprojroot_1.3-2 digest_0.6.13 tibble_1.4.1 bindrcpp_0.2
## [21] ggrepel_0.7.0 glue_1.2.0 evaluate_0.10.1 rmarkdown_1.8
## [25] labeling_0.3 stringi_1.1.6 compiler_3.4.3 pillar_1.0.1
## [29] backports_1.1.2 pkgconfig_2.0.1